- # Source Generated with Decompyle++
- # File: in.pyo (Python 2.5)
-
- import difflib
- from lxml import etree
- from lxml.html import fragment_fromstring
- import cgi
- import re
- __all__ = [
- 'html_annotate',
- 'htmldiff']
-
- try:
- _unicode = unicode
- except NameError:
- _unicode = str
-
-
- try:
- basestring = __builtins__['basestring']
- except (KeyError, NameError):
- basestring = str
-
-
- def default_markup(text, version):
- return '<span title="%s">%s</span>' % (cgi.escape(_unicode(version), 1), text)
-
-
- def html_annotate(doclist, markup = default_markup):
- tokenlist = [ tokenize_annotated(doc, version) for doc, version in doclist ]
- cur_tokens = tokenlist[0]
- for tokens in tokenlist[1:]:
- html_annotate_merge_annotations(cur_tokens, tokens)
- cur_tokens = tokens
-
- cur_tokens = compress_tokens(cur_tokens)
- result = markup_serialize_tokens(cur_tokens, markup)
- return ''.join(result).strip()
-
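- # A minimal usage sketch of html_annotate (illustrative input only; the exact
- # span markup depends on the markup callable, default_markup here):
- #
- #   html_annotate([("Hello", "version 1"), ("Hello world", "version 2")])
- #   # -> roughly: '<span title="version 1">Hello</span> <span title="version 2">world</span>'
- #
- # Each (document, version) pair is tokenized, annotations are carried forward
- # across equal regions, and adjacent tokens sharing an annotation are
- # compressed into a single marked-up span.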
-
- def tokenize_annotated(doc, annotation):
- tokens = tokenize(doc, include_hrefs = False)
- for tok in tokens:
- tok.annotation = annotation
-
- return tokens
-
-
- def html_annotate_merge_annotations(tokens_old, tokens_new):
- s = InsensitiveSequenceMatcher(a = tokens_old, b = tokens_new)
- commands = s.get_opcodes()
- for command, i1, i2, j1, j2 in commands:
- if command == 'equal':
- eq_old = tokens_old[i1:i2]
- eq_new = tokens_new[j1:j2]
- copy_annotations(eq_old, eq_new)
- continue
-
-
-
- def copy_annotations(src, dest):
- for src_tok, dest_tok in zip(src, dest):
- dest_tok.annotation = src_tok.annotation
-
-
-
- def compress_tokens(tokens):
- result = [
- tokens[0]]
- for tok in tokens[1:]:
- if not (result[-1].post_tags) and not (tok.pre_tags) and result[-1].annotation == tok.annotation:
- compress_merge_back(result, tok)
- continue
- result.append(tok)
-
- return result
-
-
- def compress_merge_back(tokens, tok):
- last = tokens[-1]
- if type(last) is not token or type(tok) is not token:
- tokens.append(tok)
- else:
- text = _unicode(last)
- if last.trailing_whitespace:
- text += ' '
-
- text += tok
- merged = token(text, pre_tags = last.pre_tags, post_tags = tok.post_tags, trailing_whitespace = tok.trailing_whitespace)
- merged.annotation = last.annotation
- tokens[-1] = merged
-
-
- def markup_serialize_tokens(tokens, markup_func):
- for token in tokens:
- for pre in token.pre_tags:
- yield pre
-
- html = token.html()
- html = markup_func(html, token.annotation)
- if token.trailing_whitespace:
- html += ' '
-
- yield html
- for post in token.post_tags:
- yield post
-
-
-
-
- def htmldiff(old_html, new_html):
- old_html_tokens = tokenize(old_html)
- new_html_tokens = tokenize(new_html)
- result = htmldiff_tokens(old_html_tokens, new_html_tokens)
- result = ''.join(result).strip()
- return fixup_ins_del_tags(result)
-
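- # A hedged usage sketch of htmldiff (hypothetical documents; the exact
- # placement of the markers depends on the tokenization below):
- #
- #   old = '<p>Here is some text.</p>'
- #   new = '<p>Here is some new text.</p>'
- #   htmldiff(old, new)
- #   # -> the new document with added words wrapped in <ins>...</ins>;
- #   #    deleted words would be wrapped in <del>...</del>.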
-
- def htmldiff_tokens(html1_tokens, html2_tokens):
- s = InsensitiveSequenceMatcher(a = html1_tokens, b = html2_tokens)
- commands = s.get_opcodes()
- result = []
- for command, i1, i2, j1, j2 in commands:
- if command == 'equal':
- result.extend(expand_tokens(html2_tokens[j1:j2], equal = True))
- continue
-
- if command == 'insert' or command == 'replace':
- ins_tokens = expand_tokens(html2_tokens[j1:j2])
- merge_insert(ins_tokens, result)
-
- if command == 'delete' or command == 'replace':
- del_tokens = expand_tokens(html1_tokens[i1:i2])
- merge_delete(del_tokens, result)
- continue
-
- result = cleanup_delete(result)
- return result
-
-
- def expand_tokens(tokens, equal = False):
- for token in tokens:
- for pre in token.pre_tags:
- yield pre
-
- if not equal or not (token.hide_when_equal):
- if token.trailing_whitespace:
- yield token.html() + ' '
- else:
- yield token.html()
-
- for post in token.post_tags:
- yield post
-
-
-
-
- def merge_insert(ins_chunks, doc):
- (unbalanced_start, balanced, unbalanced_end) = split_unbalanced(ins_chunks)
- doc.extend(unbalanced_start)
- if doc and not doc[-1].endswith(' '):
- doc[-1] += ' '
-
- doc.append('<ins>')
- if balanced and balanced[-1].endswith(' '):
- balanced[-1] = balanced[-1][:-1]
-
- doc.extend(balanced)
- doc.append('</ins> ')
- doc.extend(unbalanced_end)
-
-
- class DEL_START:
- pass
-
-
- class DEL_END:
- pass
-
-
- class NoDeletes(Exception):
- pass
-
-
- def merge_delete(del_chunks, doc):
- doc.append(DEL_START)
- doc.extend(del_chunks)
- doc.append(DEL_END)
-
-
- def cleanup_delete(chunks):
-     while True:
-
- try:
- (pre_delete, delete, post_delete) = split_delete(chunks)
- except NoDeletes:
- break
-
- (unbalanced_start, balanced, unbalanced_end) = split_unbalanced(delete)
-         locate_unbalanced_start(unbalanced_start, pre_delete, post_delete)
-         locate_unbalanced_end(unbalanced_end, pre_delete, post_delete)
- doc = pre_delete
- if doc and not doc[-1].endswith(' '):
- doc[-1] += ' '
-
- doc.append('<del>')
- if balanced and balanced[-1].endswith(' '):
- balanced[-1] = balanced[-1][:-1]
-
- doc.extend(balanced)
- doc.append('</del> ')
- doc.extend(post_delete)
- chunks = doc
- continue
- return chunks
-
-
- def split_unbalanced(chunks):
- start = []
- end = []
- tag_stack = []
- balanced = []
- for chunk in chunks:
- if not chunk.startswith('<'):
- balanced.append(chunk)
- continue
-
- endtag = chunk[1] == '/'
- name = chunk.split()[0].strip('<>/')
- if name in empty_tags:
- balanced.append(chunk)
- continue
-
- if endtag:
- if tag_stack and tag_stack[-1][0] == name:
- balanced.append(chunk)
- (name, pos, tag) = tag_stack.pop()
- balanced[pos] = tag
-             elif tag_stack:
-                 start.extend([ tag for name, pos, tag in tag_stack ])
-                 tag_stack = []
-                 end.append(chunk)
-             else:
-                 end.append(chunk)
-         else:
-             tag_stack.append((name, len(balanced), chunk))
-             balanced.append(None)
-
-     start.extend([ chunk for name, pos, chunk in tag_stack ])
-     balanced = [ chunk for chunk in balanced if chunk is not None ]
- return (start, balanced, end)
-
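- # Illustration of split_unbalanced on a hand-made chunk list (chunk lists
- # normally come from expand_tokens):
- #
- #   split_unbalanced(['<div>', 'word ', '</p>'])
- #   # -> (['<div>'], ['word '], ['</p>'])
- #
- # The unmatched opening <div> is returned as unbalanced_start, the unmatched
- # closing </p> as unbalanced_end, and only the text stays in balanced.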
-
- def split_delete(chunks):
-
- try:
- pos = chunks.index(DEL_START)
- except ValueError:
- raise NoDeletes
-
- pos2 = chunks.index(DEL_END)
- return (chunks[:pos], chunks[pos + 1:pos2], chunks[pos2 + 1:])
-
-
- def locate_unbalanced_start(unbalanced_start, pre_delete, post_delete):
-     while True:
-         if not unbalanced_start:
-             break
-         finding = unbalanced_start[0]
-         finding_name = finding.split()[0].strip('<>')
-         if not post_delete:
-             break
-         next = post_delete[0]
-         if next is DEL_START or not next.startswith('<'):
-             break
-         if next[1] == '/':
-             break
-         name = next.split()[0].strip('<>')
-         if name == 'ins':
-             break
-         if name == finding_name:
-             unbalanced_start.pop(0)
-             pre_delete.append(post_delete.pop(0))
-         else:
-             break
-
-
- def locate_unbalanced_end(unbalanced_end, pre_delete, post_delete):
-     while True:
-         if not unbalanced_end:
-             break
-         finding = unbalanced_end[-1]
-         finding_name = finding.split()[0].strip('<>/')
-         if not pre_delete:
-             break
-         next = pre_delete[-1]
-         if next is DEL_END or not next.startswith('</'):
-             break
-         name = next.split()[0].strip('<>/')
-         if name == 'ins' or name == 'del':
-             break
-         if name == finding_name:
-             unbalanced_end.pop()
-             post_delete.insert(0, pre_delete.pop())
-         else:
-             break
-
-
- class token(_unicode):
- hide_when_equal = False
-
- def __new__(cls, text, pre_tags = None, post_tags = None, trailing_whitespace = False):
- obj = _unicode.__new__(cls, text)
- if pre_tags is not None:
- obj.pre_tags = pre_tags
- else:
- obj.pre_tags = []
- if post_tags is not None:
- obj.post_tags = post_tags
- else:
- obj.post_tags = []
- obj.trailing_whitespace = trailing_whitespace
- return obj
-
-
- def __repr__(self):
- return 'token(%s, %r, %r)' % (_unicode.__repr__(self), self.pre_tags, self.post_tags)
-
-
- def html(self):
- return _unicode(self)
-
-
-
- class tag_token(token):
-
- def __new__(cls, tag, data, html_repr, pre_tags = None, post_tags = None, trailing_whitespace = False):
-         obj = token.__new__(cls, '%s: %s' % (tag, data), pre_tags = pre_tags, post_tags = post_tags, trailing_whitespace = trailing_whitespace)
- obj.tag = tag
- obj.data = data
- obj.html_repr = html_repr
- return obj
-
-
- def __repr__(self):
- return 'tag_token(%s, %s, html_repr=%s, post_tags=%r, pre_tags=%r, trailing_whitespace=%s)' % (self.tag, self.data, self.html_repr, self.pre_tags, self.post_tags, self.trailing_whitespace)
-
-
- def html(self):
- return self.html_repr
-
-
-
- class href_token(token):
- hide_when_equal = True
-
- def html(self):
- return 'Link: %s' % self
-
-
-
- def tokenize(html, include_hrefs = True):
- body_el = parse_html(html, cleanup = True)
- chunks = flatten_el(body_el, skip_tag = True, include_hrefs = include_hrefs)
- return fixup_chunks(chunks)
-
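- # Rough shape of tokenize() output for '<p>Hi <b>there</b></p>' (hypothetical
- # input): a list of token objects whose text is 'Hi' and 'there', with the
- # surrounding <p>/<b> markup attached as pre_tags and post_tags rather than
- # appearing as tokens of their own.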
-
- def parse_html(html, cleanup = True):
- if cleanup:
- html = cleanup_html(html)
-
- return fragment_fromstring(html, create_parent = True)
-
- _body_re = re.compile('<body.*?>', re.I | re.S)
- _end_body_re = re.compile('</body.*?>', re.I | re.S)
- _ins_del_re = re.compile('</?(ins|del).*?>', re.I | re.S)
-
- def cleanup_html(html):
- match = _body_re.search(html)
- if match:
- html = html[match.end():]
-
- match = _end_body_re.search(html)
- if match:
- html = html[:match.start()]
-
- html = _ins_del_re.sub('', html)
- return html
-
- end_whitespace_re = re.compile('[ \\t\\n\\r]$')
-
- def fixup_chunks(chunks):
- tag_accum = []
- cur_word = None
- result = []
- for chunk in chunks:
- if isinstance(chunk, tuple):
- if chunk[0] == 'img':
- src = chunk[1]
- tag = chunk[2]
- if tag.endswith(' '):
- tag = tag[:-1]
- trailing_whitespace = True
- else:
- trailing_whitespace = False
- cur_word = tag_token('img', src, html_repr = tag, pre_tags = tag_accum, trailing_whitespace = trailing_whitespace)
- tag_accum = []
- result.append(cur_word)
- continue
- if chunk[0] == 'href':
- href = chunk[1]
- cur_word = href_token(href, pre_tags = tag_accum, trailing_whitespace = True)
- tag_accum = []
- result.append(cur_word)
- continue
- continue
-
- if is_word(chunk):
- if chunk.endswith(' '):
- chunk = chunk[:-1]
- trailing_whitespace = True
- else:
- trailing_whitespace = False
- cur_word = token(chunk, pre_tags = tag_accum, trailing_whitespace = trailing_whitespace)
- tag_accum = []
- result.append(cur_word)
- continue
- if is_start_tag(chunk):
- tag_accum.append(chunk)
- continue
- if is_end_tag(chunk):
- if tag_accum:
- tag_accum.append(chunk)
- else:
- cur_word.post_tags.append(chunk)
-
- if not result:
- return [
- token('', pre_tags = tag_accum)]
- else:
- result[-1].post_tags.extend(tag_accum)
- return result
-
- empty_tags = ('param', 'img', 'area', 'br', 'basefont', 'input', 'base', 'meta', 'link', 'col')
- block_level_tags = ('address', 'blockquote', 'center', 'dir', 'div', 'dl', 'fieldset', 'form', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'hr', 'isindex', 'menu', 'noframes', 'noscript', 'ol', 'p', 'pre', 'table', 'ul')
- block_level_container_tags = ('dd', 'dt', 'frameset', 'li', 'tbody', 'td', 'tfoot', 'th', 'thead', 'tr')
-
- def flatten_el(el, include_hrefs, skip_tag = False):
- if not skip_tag:
- if el.tag == 'img':
- yield ('img', el.attrib['src'], start_tag(el))
- else:
- yield start_tag(el)
-
- if el.tag in empty_tags and not (el.text) and not len(el) and not (el.tail):
- return None
-
- start_words = split_words(el.text)
- for word in start_words:
- yield cgi.escape(word)
-
- for child in el:
- for item in flatten_el(child, include_hrefs = include_hrefs):
- yield item
-
-
- if el.tag == 'a' and el.attrib.get('href') and include_hrefs:
- yield ('href', el.attrib['href'])
-
- if not skip_tag:
- yield end_tag(el)
- end_words = split_words(el.tail)
- for word in end_words:
- yield cgi.escape(word)
-
-
-
-
- def split_words(text):
- if not text or not text.strip():
- return []
-
- words = [ w + ' ' for w in text.strip().split() ]
- return words
-
- start_whitespace_re = re.compile('^[ \\t\\n\\r]')
-
- def start_tag(el):
-     return '<%s%s>' % (el.tag, ''.join([ ' %s="%s"' % (name, cgi.escape(value, True)) for name, value in el.attrib.items() ]))
-
-
- def end_tag(el):
- if el.tail and start_whitespace_re.search(el.tail):
- extra = ' '
- else:
- extra = ''
- return '</%s>%s' % (el.tag, extra)
-
-
- def is_word(tok):
- return not tok.startswith('<')
-
-
- def is_end_tag(tok):
- return tok.startswith('</')
-
-
- def is_start_tag(tok):
-     return tok.startswith('<') and not tok.startswith('</')
-
-
- def fixup_ins_del_tags(html):
- doc = parse_html(html, cleanup = False)
- _fixup_ins_del_tags(doc)
- html = serialize_html_fragment(doc, skip_outer = True)
- return html
-
-
- def serialize_html_fragment(el, skip_outer = False):
- html = etree.tostring(el, method = 'html', encoding = _unicode)
- if skip_outer:
- html = html[html.find('>') + 1:]
- html = html[:html.rfind('<')]
- return html.strip()
- else:
- return html
-
-
- def _fixup_ins_del_tags(doc):
- for tag in [
- 'ins',
- 'del']:
- for el in doc.xpath('descendant-or-self::%s' % tag):
- if not _contains_block_level_tag(el):
- continue
-
- _move_el_inside_block(el, tag = tag)
- el.drop_tag()
-
-
-
-
- def _contains_block_level_tag(el):
- if el.tag in block_level_tags or el.tag in block_level_container_tags:
- return True
-
- for child in el:
- if _contains_block_level_tag(child):
- return True
- continue
-
- return False
-
-
- def _move_el_inside_block(el, tag):
- for child in el:
- if _contains_block_level_tag(child):
- break
- continue
-     else:
-         # No block-level tags in any child: wrap the element's content in one new element
-         children_tag = etree.Element(tag)
-         children_tag.text = el.text
-         el.text = None
-         children_tag.extend(list(el))
-         el[:] = [
-             children_tag]
-         return
- for child in list(el):
- if _contains_block_level_tag(child):
- _move_el_inside_block(child, tag)
- if child.tail:
- tail_tag = etree.Element(tag)
- tail_tag.text = child.tail
- child.tail = None
- el.insert(el.index(child) + 1, tail_tag)
-
-         else:
-             child_tag = etree.Element(tag)
-             el.replace(child, child_tag)
-             child_tag.append(child)
-
- if el.text:
- text_tag = etree.Element(tag)
- text_tag.text = el.text
- el.text = None
- el.insert(0, text_tag)
-
-
-
- def _merge_element_contents(el):
- parent = el.getparent()
-     text = el.text or ''
- if el.tail:
- if not len(el):
- text += el.tail
- elif el[-1].tail:
- el[-1].tail += el.tail
- else:
- el[-1].tail = el.tail
-
- index = parent.index(el)
- if text:
- if index == 0:
- previous = None
- else:
- previous = parent[index - 1]
- if previous is None:
- if parent.text:
- parent.text += text
- else:
- parent.text = text
- elif previous.tail:
- previous.tail += text
- else:
- previous.tail = text
-
- parent[index:index + 1] = el.getchildren()
-
-
- class InsensitiveSequenceMatcher(difflib.SequenceMatcher):
- threshold = 2
-
- def get_matching_blocks(self):
-         size = min(len(self.a), len(self.b))
- threshold = min(self.threshold, size / 4)
- actual = difflib.SequenceMatcher.get_matching_blocks(self)
-         return [ item for item in actual if item[2] > threshold or not item[2] ]
-
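- # Sketch of the intended effect (hypothetical strings, split into word tokens):
- #
- #   a = 'the quick brown fox'.split()
- #   b = 'a lazy dog crossed the road'.split()
- #   InsensitiveSequenceMatcher(a=a, b=b).get_matching_blocks()
- #   # -> only the terminating zero-length block; the lone shared word 'the'
- #   #    is below the threshold and is not reported as an equal block.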
-
- if __name__ == '__main__':
- from lxml.html import _diffcommand
- _diffcommand.main()
-
-